Simulation for comparing SSH, SSH-SSL, SSH-FL and Seurat, using both
Louvain Clustering method and SC3 clustering
Using the R Package Splatter, which uses a Gamma-Poisson model, I
simulated scRNA-seq data consisting of 200 cells x 2000 genesÂ
5 groups (clusters), with probabilities of 0.7,0.1,0.02,0.15 and
0.03, mimicking the population percentages seen in Tabula Sapiens dataÂ
Mimicks the rare cell populations seen in real data.
Clustering performance are evaluated using Adjusted Rand Index, which
compares predicted clusters with true labels.
##function for SSH
ari_ssh <- c()
ari_seurat <- c()
ari_feastfl <- c()
ari_sparse <- c()
ari_scm <- c()
ari_sc3_ssl <- c()
ari_sc3_fl <- c()
ari_sc3_seurat <- c()
ari_sc3_lasso <- c()
ari_sc3_scm <- c()
library(splatter)
library(SCMarker)
library(FEAST)
compare_met <- function(nreps, nogenes, nocells, sc3 = FALSE) {
for (i in 1:nreps) {
sim.groups3 <- splatSimulate(nGenes=nogenes, batchCells=nocells, group.prob = c(0.7,0.1,0.02,0.15,0.03), method = "groups",
verbose = FALSE)
Y <- counts(sim.groups3)
Y <- as.matrix(Y)
library(dplyr)
tot_med <- colSums(Y) %>%
median()
norm1 <- function(vec) {
vec/sum(vec)
}
Y <- apply(Y, 2, norm1)
Y <- Y * tot_med
Y <- Y + 0.1
Y <- t(Y)
Y <- log2(Y - min(Y) + 1)
tb <- tibble(cell = rownames(Y)) %>%
bind_cols(as_tibble(Y))
t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell
# center for each feature
t <- scale(t, scale = FALSE)
# scale so that the overall variance is unit
t <- t/sd(as.double(t))
nperms = 3
lambda1 <- 0.0001
lambda0s <- seq(0.001, 1000, length = 100)
#
# # this is the most costly computation in RECOMBINE
#set.seed(1) ##important!!
#library(recombine)
out3 <- SHC_SSL_gapstat(x = t,
nperms = nperms,
lambda0s = lambda0s,
lambda1 = lambda1)
w2 <- out3$result$w
names(w2) <- colnames(t)
w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]
names <- attr(w2_nonzero, "names")
idxssl3 <- match(names, colnames(t))
data <- counts(sim.groups3)
data <- CreateSeuratObject(counts = data,
min.features = 0,
min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data))
# Perform linear dimensional reduction
##For SHC-SSL -
sobj <- RunPCA(sobj, features = rownames(data)[idxssl3])
sobj <- FindNeighbors(sobj, dims=1:20)
sobj <- FindClusters(sobj)
trueclass <- colData(sim.groups3)$Group
ari_ssh[i] <- eval_Cluster(sobj@meta.data$seurat_clusters, trueclass)["ARI"]
####compare seurat
data <- counts(sim.groups3)
library(Seurat)
data2 <- CreateSeuratObject(counts = data,
project = "seurat",
min.features = 0,
min.cells = 0)
data2 <- FindVariableFeatures(data2, selection.method = "vst", nfeatures = length(idxssl3))
all.genes <- rownames(data2)
data2 <- ScaleData(data2, features=all.genes)
# set variable genes as all discriminant markers
##
data2 <- RunPCA(data2, features=VariableFeatures(data2))
data2 <- FindNeighbors(data2, dims=1:20)
data2 <- FindClusters(data2) #resolution = 1.2) #resolution = 1.08) #0.76
trueclass <- colData(sim.groups3)$Group
ari_seurat[i] <- eval_Cluster(data2@meta.data$seurat_clusters, trueclass)["ARI"]
#LASSO sparse
shc.permute.out2 <- sparcl::HierarchicalSparseCluster.permute(x = t,
nperms = 3,
wbounds = seq(1.1, 10, len = 100))
shc.out2 <- sparcl::HierarchicalSparseCluster(x = t,
wbound = shc.permute.out2$bestw)
w <- shc.out2$ws
names(w) <- colnames(t)
w <- sort(w, decreasing = TRUE)
w_nonzero <- w[w > 0]
names <- attr(w_nonzero, "names")
idxlasso2 <- match(names, colnames(t))
data3 <- counts(sim.groups3)
data3 <- CreateSeuratObject(counts = data3,
min.features = 0,
min.cells = 0)
data3 <- NormalizeData(data3)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data3 <- ScaleData(data3, features = rownames(data3))
# Perform linear dimensional reduction
data3 <- RunPCA(data3, features = rownames(data3)[idxlasso2])
pca <- data3[['pca']]
data3 <- FindNeighbors(data3, dims=1:min(length(pca@stdev),20))
data3 <- FindClusters(data3) #resolution = 0.76)
trueclass <- colData(sim.groups3)$Group
ari_sparse[i] <- eval_Cluster(data3@meta.data$seurat_clusters, trueclass)["ARI"]
#FEAST - FL
Y <- counts(sim.groups3)
#Y = process_Y(Y)
ixs = FEAST(Y, k=5)
t2 <- t[, ixs]
out4 <- SHC_FL_gapstat(t2,
lambda1s = seq(0, 0.5*max(nrow(t), ncol(t)), length = 5),
lambda2s = seq(1, 10*max(nrow(t), ncol(t)), length = 5),
nperms = 3)
w2 <- out4$result$w
names(w2) <- colnames(t2)
w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]
names <- attr(w2_nonzero, "names")
idxssl4 <- match(names, colnames(t2))
data4 <- counts(sim.groups3)
data4 <- CreateSeuratObject(counts = data4,
project = "fl",
min.features = 0,
min.cells = 0)
data4 <- NormalizeData(data4)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data4 <- ScaleData(data4, features = rownames(data4))
# Perform linear dimensional reduction
##For SHC-SSL -
data4 <- RunPCA(data4, features = rownames(data4)[ixs][idxssl4])
data4 <- FindNeighbors(data4, dims=1:20)
data4 <- FindClusters(data4)
trueclass <- colData(sim.groups3)$Group
ari_feastfl[i] <- eval_Cluster(data4@meta.data$seurat_clusters, trueclass)["ARI"]
#SC-Marker
Y = process_Y(Y)
res=ModalFilter(data=Y,geneK=10,cellK=10,width=2)# default widt
res=GeneFilter(obj=res)
res=getMarker(obj=res,k=300,n=30)
head(res$marker)
idxssl6<-match(res$marker, rownames(Y))
data6 <- counts(sim.groups3)
data6 <- CreateSeuratObject(counts = data6,
min.features = 0,
min.cells = 0)
data6 <- NormalizeData(data6)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data6 <- ScaleData(data6, features = rownames(data6))
# Perform linear dimensional reduction
data6 <- RunPCA(data6, features = rownames(data6)[idxssl6])
data6 <- FindNeighbors(data6, dims=1:20)
data6 <- FindClusters(data6)
trueclass <- colData(sim.groups3)$Group
ari_scm[i] <- eval_Cluster(data6@meta.data$seurat_clusters, trueclass)["ARI"]
if (sc3 == TRUE) {
sobj2 <- counts(sim.groups3)
#library(FEAST)
sc3_reslv <- SC3_Clust(sobj2, k=5, input_markers=VariableFeatures(data2))
trueclass <- colData(sim.groups3)$Group
ari_sc3_seurat[i] <- eval_Cluster(sc3_reslv$cluster, trueclass)["ARI"]
sc3_reslasso <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxlasso2])
trueclass <- colData(sim.groups3)$Group
ari_sc3_lasso[i] <- eval_Cluster(sc3_reslasso$cluster, trueclass)["ARI"]
sc3_res <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxssl3])
trueclass <- colData(sim.groups3)$Group
ari_sc3_ssl[i] <- eval_Cluster(sc3_res$cluster, trueclass)["ARI"]
sc3_resfl <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[ixs][idxssl4])
trueclass <- colData(sim.groups3)$Group
ari_sc3_fl[i] <- eval_Cluster(sc3_resfl$cluster, trueclass)["ARI"]
sc3_resscm <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxssl6])
ari_sc3_scm[i] <- eval_Cluster(sc3_resscm$cluster, trueclass)["ARI"]
}
}
results <- list(ari_ssh, ari_seurat, ari_feastfl, ari_sparse, ari_scm, ari_sc3_seurat, ari_sc3_lasso, ari_sc3_ssl, ari_sc3_fl, ari_sc3_scm)
return(results)
}
#doing 10 simulations
compare_met(100, 2000, 200, sc3 = TRUE)
# 6 more
Simulation results
Figure 1 : SHC-SSL and SHC-FL performed better than Seurat and SHC in
clustering data with rare cell populations (due to computation, this is
not displayed here, please refer to PPT)
Figure 2: SHC-SSL and SHC-FL similarly performed better than SHC
using SC3 clustering (due to computation, this is not displayed here,
please refer to PPT)
Comparing feature selection methods with real data - Tabula Sapiens
human kidney cells data
Challenges - 7 cell types :Â Kidney epithelial cell, B cell, CD4 T
cell, CD8 T cell, NK cell, Macropages and Endothelial cell -> more
than 8000 out of 9000 cells are kidney epithelial cells, Seurat performs
poorly in clustering rare cell populationsÂ
gap statistic
out <- readRDS("scRNA_shc_ssl_out.rds")
plot(out$w_l0norm,
out$gaps_mean,
log = "x",
xlab = "# Non-zero Features",
ylab = "Gap Statistics",
ylim = c(min(out$gaps_mean - out$gaps_se) - 0.0001,
max(out$gaps_mean + out$gaps_se) + 0.0001),
type = "l",
lwd = 1)
arrows(x0 = out$w_l0norm,
y0 = out$gaps_mean - out$gaps_se,
x1 = out$w_l0norm,
y1 = out$gaps_mean + out$gaps_se,
code = 3, angle = 90, length = 0.02, lwd = 1)

selecting the non-zero weight features for recombine
w <- out$result$w
names(w) <- colnames(mt_expr)
w <- sort(w, decreasing = TRUE)
w_nonzero <- w[w > 0]
tb <- tibble(feature = names(w),
w = w)
tb <- tb %>%
mutate(i = 1:nrow(tb)) %>%
mutate(label = ifelse(i <= 10, feature, ""))
options(ggrepel.max.overlaps = Inf)
ggline(tb, "i", "w",
xlab = "feature index",
label = "label",
repel = TRUE,
label.rectangle = TRUE,
point.size = 0.1,
plot_type = "p")

Seurat’s built in feature selection method (vst)
library(Seurat)
#normalize
kidata <- NormalizeData(kidata)
#now, use blood1 for HVF selected by Seurat, and blood2 for ones selected by FEAST
kidata <- FindVariableFeatures(kidata, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(kidata)
kidata <- ScaleData(kidata, features=all.genes)
#if using FEAST should input selected features here
kidata1 <- RunPCA(kidata, features=VariableFeatures(object=kidata))
ElbowPlot(kidata1) #use 18 PCs
kidata1 <- FindNeighbors(kidata1, dims=1:15)
kidata1 <- FindClusters(kidata1, resolution = 0.18)
trueclass <- kidata@meta.data$free_annotation
UMAP (with true labels)
#set.seed(1)
#kidata1 <- RunUMAP(kidata1, dims=1:50)
DimPlot(kidata1, reduction="umap")

# Step 1: Add true class and clustering labels to the metadata
kidata1$trueclass <- kidata1@meta.data$free_annotation
# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata1, "umap")) # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2") # Rename columns
umap_data$seurat_clusters <- kidata1$seurat_clusters # Add cluster labels
umap_data$trueclass <- kidata1$trueclass # Add true class labels
# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
labs(
color = "Cluster",
x = "UMAP 1",
y = "UMAP 2",
title = "UMAP: Clustering Results with True Class Overlay"
) +
theme_minimal() +
theme(
legend.position = "right", # Adjust legend position
plot.title = element_text(hjust = 0.5) # Center the title
)

Sparse Hierarchial Clustering (SHC)
#normalize
Y1 <- as.matrix(Y1)
library(dplyr)
tot_med <- colSums(Y1) %>%
median()
norm1 <- function(vec) {
vec/sum(vec)
}
Y1 <- apply(Y1, 2, norm1)
Y1 <- Y1 * tot_med
Y1 <- Y1 + 0.1
Y1 <- t(Y1)
Y1 <- log2(Y1 - min(Y1) + 1)
tb <- tibble(cell = rownames(Y1)) %>%
bind_cols(as_tibble(Y1))
t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell
# center for each feature
t <- scale(t, scale = FALSE)
# scale so that the overall variance is unit
t <- t/sd(as.double(t))
dim(t)
View(t)
shc.permute.out <- sparcl::HierarchicalSparseCluster.permute(x = t,
nperms = 3,
wbounds = seq(1.1, 10, len = 100))
w3 <- shc.permute.out$result$w
names(w3) <- colnames(t)
w3 <- sort(w3, decreasing = TRUE)
w3_nonzero <- w3[w3 > 0]
names <- attr(w3_nonzero, "names")
idxssl4 <- match(names, rownames(Y))
length(idxssl4)
#data <- counts(sim.groups3)
library(Seurat)
data <- CreateSeuratObject(counts = Y,
min.features = 0,
min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data))
# Perform linear dimensional reduction
##For SHC-SSL -
kidata4 <- RunPCA(sobj, features = rownames(data)[idxssl4])
kidata4 <- FindNeighbors(kidata2, dims=1:20)
kidata4 <- FindClusters(kidata2, resolution=0.16)
kidata4$trueclass <- kidata1@meta.data$free_annotation
ki_shc<-eval_Cluster(kidata4@meta.data$seurat_clusters, trueclass)
kidata4 <- FindNeighbors(kidata4, dims=1:15)
kidata4 <- FindClusters(kidata4, resolution = 0.12)
trueclass <- kidata@meta.data$free_annotation
UMAP (with true labels)
set.seed(1)
kidata4 <- RunUMAP(kidata4, dims=1:50)
DimPlot(kidata4, reduction="umap")

library(ggplot2)
# Step 1: Add true class and clustering labels to the metadata
kidata4$trueclass <- kidata1@meta.data$free_annotation
# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata4, "umap")) # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2") # Rename columns
umap_data$seurat_clusters <- kidata4$seurat_clusters # Add cluster labels
umap_data$trueclass <- kidata4$trueclass # Add true class labels
# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
labs(
color = "Cluster",
x = "UMAP 1",
y = "UMAP 2",
title = "UMAP: Clustering Results with True Class Overlay"
) +
theme_minimal() +
theme(
legend.position = "right", # Adjust legend position
plot.title = element_text(hjust = 0.5) # Center the title
)

RECOMBINE SHC-SSL algorithm
#normalize
Y1 <- as.matrix(Y1)
library(dplyr)
tot_med <- colSums(Y1) %>%
median()
norm1 <- function(vec) {
vec/sum(vec)
}
Y1 <- apply(Y1, 2, norm1)
Y1 <- Y1 * tot_med
Y1 <- Y1 + 0.1
Y1 <- t(Y1)
Y1 <- log2(Y1 - min(Y1) + 1)
tb <- tibble(cell = rownames(Y1)) %>%
bind_cols(as_tibble(Y1))
t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell
# center for each feature
t <- scale(t, scale = FALSE)
# scale so that the overall variance is unit
t <- t/sd(as.double(t))
dim(t)
View(t)
nperms = 3
lambda1 <- 0.0001
lambda0s <- seq(0.001, 1000, length = 100)
library(recombine)
#
# # this is the most costly computation in RECOMBINE
#set.seed(1) ##important!!
library(recombine)
out3 <- SHC_SSL_gapstat(x = t,
nperms = nperms,
lambda0s = lambda0s,
lambda1 = lambda1)
w2 <- out3$result$w
names(w2) <- colnames(t)
w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]
names <- attr(w2_nonzero, "names")
idxssl3 <- match(names, rownames(Y))
length(idxssl3)
#data <- counts(sim.groups3)
library(Seurat)
data <- CreateSeuratObject(counts = Y,
min.features = 0,
min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data))
# Perform linear dimensional reduction
##For SHC-SSL -
kidata2 <- RunPCA(sobj, features = rownames(data)[idxssl3])
kidata2 <- FindNeighbors(kidata2, dims=1:20)
kidata2 <- FindClusters(kidata2, resolution=0.16)
kidata2$trueclass <- kidata1@meta.data$free_annotation
ki_recombine<-eval_Cluster(kidata2@meta.data$seurat_clusters, trueclass)
UMAP (with true labels)
kidata3 <- RunUMAP(kidata3, features=rownames(data)[idxssl3])
DimPlot(kidata3, reduction="umap")

# Step 1: Add true class and clustering labels to the metadata
kidata3$trueclass <- kidata1@meta.data$free_annotation
# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata3, "umap")) # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2") # Rename columns
umap_data$seurat_clusters <- kidata3$seurat_clusters # Add cluster labels
umap_data$trueclass <- kidata3$trueclass # Add true class labels
# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
labs(
color = "Cluster",
x = "UMAP 1",
y = "UMAP 2",
title = "UMAP: Clustering Results with True Class Overlay"
) +
theme_minimal() +
theme(
legend.position = "right", # Adjust legend position
plot.title = element_text(hjust = 0.5) # Center the title
)

SCMarker algorithm
library(SCMarker)
dim(Y)
Y1<-log(Y+1)
res=ModalFilter(data=Y1,geneK=10,cellK=10,width=2)
res=GeneFilter(obj=res)
res=getMarker(obj=res,k=300,n=30)
head(res$marker)
length(res$marker) #938
UMAP (with true labels)
#kidata2 <- RunUMAP(kidata2, features=genes)
DimPlot(kidata2, reduction="umap")

# Step 1: Add true class and clustering labels to the metadata
kidata2$trueclass <- kidata1@meta.data$free_annotation
# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata2, "umap")) # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2") # Rename columns
umap_data$seurat_clusters <- kidata2$seurat_clusters # Add cluster labels
umap_data$trueclass <- kidata2$trueclass # Add true class labels
# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
labs(
color = "Cluster",
x = "UMAP 1",
y = "UMAP 2",
title = "UMAP: Clustering Results with True Class Overlay"
) +
theme_minimal() +
theme(
legend.position = "right", # Adjust legend position
plot.title = element_text(hjust = 0.5) # Center the title
)

Compare the four methods in clustering accuracy
res_compare<-cbind(ki_seurat, ki_recombine, ki_scmarker, ki_shc)
kable(res_compare, caption = "Cluster Evaluation Metrics")
Cluster Evaluation Metrics
| ARI |
0.2996862 |
0.8176945 |
0.8940461 |
0.2218051 |
| Purity |
0.9539467 |
0.9533243 |
0.9537392 |
0.9497977 |
| Jaccard |
0.4672912 |
0.9011177 |
0.9446944 |
0.3731324 |
| FM |
0.6821978 |
0.9490565 |
0.9718630 |
0.6082610 |
It is shown that the SHC-SSL feature selection algorithm outperforms
both Seurat and traditional SHC, and it is comparable to the SCMarker
algorithm.
---
title: Identifying rare cell populations and improving clustering accuracy using RECOMBINE algorithm
subtitle: Simulation and real data analysis 
author: Wanru Guo 
date: "Last compiled on `r format(Sys.time(), '%B %d, %Y')`"
output:
  html_notebook: 
    toc: yes
    toc_depth: 4
    toc_float: yes
---

# Simulation for comparing SSH, SSH-SSL, SSH-FL and Seurat, using both Louvain Clustering method and SC3 clustering 
Using the R Package Splatter, which uses a Gamma-Poisson model, I simulated scRNA-seq data consisting of 200 cells x 2000 genes 

5 groups (clusters), with probabilities of 0.7,0.1,0.02,0.15 and 0.03, mimicking the population percentages seen in Tabula Sapiens data 
Mimicks the rare cell populations seen in real data. 

Clustering performance are evaluated using Adjusted Rand Index, which compares predicted clusters with true labels. 

```{r}
##function for SSH 
ari_ssh <- c() 
ari_seurat <- c() 
ari_feastfl <- c() 
ari_sparse <- c() 
ari_scm <- c() 
ari_sc3_ssl <- c() 
ari_sc3_fl <- c() 
ari_sc3_seurat <- c() 
ari_sc3_lasso <- c() 
ari_sc3_scm <- c() 

library(splatter)
library(SCMarker)
library(FEAST)
compare_met <- function(nreps, nogenes, nocells, sc3 = FALSE) { 
for (i in 1:nreps) { 
sim.groups3 <- splatSimulate(nGenes=nogenes, batchCells=nocells, group.prob = c(0.7,0.1,0.02,0.15,0.03), method = "groups",
                             verbose = FALSE) 

Y <- counts(sim.groups3) 

Y <- as.matrix(Y)
library(dplyr)
tot_med <- colSums(Y) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y <- apply(Y, 2, norm1)
Y <- Y * tot_med
Y <- Y + 0.1
Y <- t(Y) 

Y <- log2(Y - min(Y) + 1)
tb <- tibble(cell = rownames(Y)) %>%
  bind_cols(as_tibble(Y)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 

nperms = 3 
lambda1 <- 0.0001
lambda0s <- seq(0.001, 1000, length = 100)
# 
# # this is the most costly computation in RECOMBINE
#set.seed(1)   ##important!! 
#library(recombine)
out3 <- SHC_SSL_gapstat(x = t,
                        nperms = nperms,
                        lambda0s = lambda0s,
                        lambda1 = lambda1)


w2 <- out3$result$w

names(w2) <- colnames(t)

w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl3 <- match(names, colnames(t))

data <- counts(sim.groups3) 
data <- CreateSeuratObject(counts = data,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
sobj <- RunPCA(sobj, features = rownames(data)[idxssl3]) 

sobj <- FindNeighbors(sobj, dims=1:20)  
sobj <- FindClusters(sobj) 

trueclass <- colData(sim.groups3)$Group  
ari_ssh[i] <- eval_Cluster(sobj@meta.data$seurat_clusters, trueclass)["ARI"]

####compare seurat 
data <- counts(sim.groups3)
library(Seurat)
data2 <- CreateSeuratObject(counts = data,
                            project = "seurat",
                            min.features = 0,
                            min.cells = 0)
data2 <- FindVariableFeatures(data2, selection.method = "vst", nfeatures = length(idxssl3)) 

all.genes <- rownames(data2) 
data2 <- ScaleData(data2, features=all.genes)
# set variable genes as all discriminant markers
##
data2 <- RunPCA(data2, features=VariableFeatures(data2)) 

data2 <- FindNeighbors(data2, dims=1:20)  
data2 <- FindClusters(data2) #resolution = 1.2) #resolution = 1.08)  #0.76 
trueclass <- colData(sim.groups3)$Group  
ari_seurat[i] <- eval_Cluster(data2@meta.data$seurat_clusters, trueclass)["ARI"]

#LASSO sparse 
shc.permute.out2 <- sparcl::HierarchicalSparseCluster.permute(x = t,
                                                              nperms = 3,
                                                              wbounds = seq(1.1, 10, len = 100))
shc.out2 <- sparcl::HierarchicalSparseCluster(x = t,
                                              wbound = shc.permute.out2$bestw)

w <- shc.out2$ws
names(w) <- colnames(t)
w <- sort(w, decreasing = TRUE)
w_nonzero <- w[w > 0]

names <- attr(w_nonzero, "names")
idxlasso2 <- match(names, colnames(t))

data3 <- counts(sim.groups3) 
data3 <- CreateSeuratObject(counts = data3,
                            min.features = 0,
                            min.cells = 0)
data3 <- NormalizeData(data3)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data3 <- ScaleData(data3, features = rownames(data3))
# Perform linear dimensional reduction

data3 <- RunPCA(data3, features = rownames(data3)[idxlasso2]) 

pca <- data3[['pca']]  

data3 <- FindNeighbors(data3, dims=1:min(length(pca@stdev),20))   

data3 <- FindClusters(data3) #resolution = 0.76)  
trueclass <- colData(sim.groups3)$Group  
ari_sparse[i] <- eval_Cluster(data3@meta.data$seurat_clusters, trueclass)["ARI"]

#FEAST - FL 
Y <- counts(sim.groups3)
#Y = process_Y(Y) 
ixs = FEAST(Y, k=5)
t2 <- t[, ixs] 

out4 <- SHC_FL_gapstat(t2,
                       lambda1s = seq(0, 0.5*max(nrow(t), ncol(t)), length = 5),
                       lambda2s = seq(1, 10*max(nrow(t), ncol(t)), length = 5),
                       nperms = 3)

w2 <- out4$result$w

names(w2) <- colnames(t2)
w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl4 <- match(names, colnames(t2))

data4 <- counts(sim.groups3) 
data4 <- CreateSeuratObject(counts = data4,
                           project = "fl",
                           min.features = 0,
                           min.cells = 0)
data4 <- NormalizeData(data4)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data4 <- ScaleData(data4, features = rownames(data4)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
data4 <- RunPCA(data4, features = rownames(data4)[ixs][idxssl4]) 

data4 <- FindNeighbors(data4, dims=1:20)  
data4 <- FindClusters(data4) 

trueclass <- colData(sim.groups3)$Group 
ari_feastfl[i] <- eval_Cluster(data4@meta.data$seurat_clusters, trueclass)["ARI"]

#SC-Marker 
Y = process_Y(Y) 
res=ModalFilter(data=Y,geneK=10,cellK=10,width=2)# default widt
res=GeneFilter(obj=res)
res=getMarker(obj=res,k=300,n=30)  
head(res$marker)
idxssl6<-match(res$marker, rownames(Y)) 
data6 <- counts(sim.groups3) 
data6 <- CreateSeuratObject(counts = data6,
                            min.features = 0,
                            min.cells = 0)
data6 <- NormalizeData(data6)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data6 <- ScaleData(data6, features = rownames(data6)) 
# Perform linear dimensional reduction

data6 <- RunPCA(data6, features = rownames(data6)[idxssl6]) 

data6 <- FindNeighbors(data6, dims=1:20)  
data6 <- FindClusters(data6) 

trueclass <- colData(sim.groups3)$Group 
ari_scm[i] <- eval_Cluster(data6@meta.data$seurat_clusters, trueclass)["ARI"]

if (sc3 == TRUE) {
  sobj2 <- counts(sim.groups3)
  #library(FEAST)
  sc3_reslv <- SC3_Clust(sobj2, k=5, input_markers=VariableFeatures(data2)) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_seurat[i] <- eval_Cluster(sc3_reslv$cluster, trueclass)["ARI"]
  
  sc3_reslasso <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxlasso2]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_lasso[i] <- eval_Cluster(sc3_reslasso$cluster, trueclass)["ARI"]
  
  sc3_res <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxssl3]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_ssl[i] <- eval_Cluster(sc3_res$cluster, trueclass)["ARI"]
  
  sc3_resfl <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[ixs][idxssl4]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_fl[i] <- eval_Cluster(sc3_resfl$cluster, trueclass)["ARI"]
  
  sc3_resscm <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxssl6]) 
  ari_sc3_scm[i] <- eval_Cluster(sc3_resscm$cluster, trueclass)["ARI"]
    } 
} 
results <- list(ari_ssh, ari_seurat, ari_feastfl, ari_sparse, ari_scm, ari_sc3_seurat, ari_sc3_lasso, ari_sc3_ssl, ari_sc3_fl, ari_sc3_scm)   
return(results) 
} 

#doing 10 simulations 
compare_met(100, 2000, 200, sc3 = TRUE) 

# 6 more 

```

## Simulation results 
Figure 1 : SHC-SSL and SHC-FL performed better than Seurat and SHC in clustering data with rare cell populations  (due to computation, this is not displayed here, please refer to PPT) 

Figure 2: SHC-SSL and SHC-FL similarly performed better than SHC using SC3 clustering  (due to computation, this is not displayed here, please refer to PPT) 


# Comparing feature selection methods with real data - Tabula Sapiens human kidney cells data 

Challenges - 
7 cell types : Kidney epithelial cell, B cell, CD4 T cell, CD8 T cell, NK cell, Macropages and Endothelial cell 
-> more than 8000 out of 9000 cells are kidney epithelial cells, Seurat performs poorly in clustering rare cell populations 

gap statistic 

```{r}
out <- readRDS("scRNA_shc_ssl_out.rds") 
plot(out$w_l0norm,
     out$gaps_mean,
     log = "x",
     xlab = "# Non-zero Features",
     ylab = "Gap Statistics",
     ylim = c(min(out$gaps_mean - out$gaps_se) - 0.0001,
              max(out$gaps_mean + out$gaps_se) + 0.0001),
     type = "l",
     lwd = 1)
arrows(x0 = out$w_l0norm,
       y0 = out$gaps_mean - out$gaps_se,
       x1 = out$w_l0norm,
       y1 = out$gaps_mean + out$gaps_se,
       code = 3, angle = 90, length = 0.02, lwd = 1)
```

selecting the non-zero weight features for recombine 
```{r}
w <- out$result$w
names(w) <- colnames(mt_expr)

w <- sort(w, decreasing = TRUE)
w_nonzero <- w[w > 0]

tb <- tibble(feature = names(w),
             w = w)
tb <- tb %>%
  mutate(i = 1:nrow(tb)) %>%
  mutate(label = ifelse(i <= 10, feature, ""))

options(ggrepel.max.overlaps = Inf)
ggline(tb, "i", "w",
            xlab = "feature index",
            label = "label",
            repel = TRUE,
            label.rectangle = TRUE,
            point.size = 0.1,
            plot_type = "p")
```

## Seurat's built in feature selection method (vst)
```{r}
library(Seurat)
#normalize 
kidata <- NormalizeData(kidata)

#now, use blood1 for HVF selected by Seurat, and blood2 for ones selected by FEAST
kidata <- FindVariableFeatures(kidata, selection.method = "vst", nfeatures = 2000)

all.genes <- rownames(kidata) 
kidata <- ScaleData(kidata, features=all.genes)
#if using FEAST should input selected features here 
kidata1 <- RunPCA(kidata, features=VariableFeatures(object=kidata))
```

```{r}
ElbowPlot(kidata1) #use 18 PCs 
```

```{r}
kidata1 <- FindNeighbors(kidata1, dims=1:15)

kidata1 <- FindClusters(kidata1, resolution = 0.18) 
trueclass <- kidata@meta.data$free_annotation
```

### evaluating the performance of Seurat's built in feature selection method (vst)
```{r}
library(FEAST)
ki_seurat<-eval_Cluster(kidata1@meta.data$seurat_clusters, trueclass) 

library(knitr)
kable(ki_seurat, caption = "Cluster Evaluation Metrics")
```
### UMAP (with true labels) 
```{r}
set.seed(1)
kidata1 <- RunUMAP(kidata1, dims=1:50) 
DimPlot(kidata1, reduction="umap")
```


```{r}
# Step 1: Add true class and clustering labels to the metadata
kidata1$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata1, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata1$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata1$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  )
```



## Sparse Hierarchial Clustering (SHC) 
```{r}
#normalize 
Y1 <- as.matrix(Y1)

library(dplyr)
tot_med <- colSums(Y1) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y1 <- apply(Y1, 2, norm1)
Y1 <- Y1 * tot_med
Y1 <- Y1 + 0.1
Y1 <- t(Y1) 

Y1 <- log2(Y1 - min(Y1) + 1)
tb <- tibble(cell = rownames(Y1)) %>%
  bind_cols(as_tibble(Y1)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 
dim(t)
View(t)

shc.permute.out <- sparcl::HierarchicalSparseCluster.permute(x = t,
                                                             nperms = 3,
                                                             wbounds = seq(1.1, 10, len = 100))


w3 <- shc.permute.out$result$w

names(w3) <- colnames(t)

w3 <- sort(w3, decreasing = TRUE)
w3_nonzero <- w3[w3 > 0]

names <- attr(w3_nonzero, "names")
idxssl4 <- match(names, rownames(Y))
length(idxssl4) 
#data <- counts(sim.groups3) 
library(Seurat)

data <- CreateSeuratObject(counts = Y,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
kidata4 <- RunPCA(sobj, features = rownames(data)[idxssl4]) 

kidata4 <- FindNeighbors(kidata2, dims=1:20)  
kidata4 <- FindClusters(kidata2, resolution=0.16)

kidata4$trueclass <- kidata1@meta.data$free_annotation

ki_shc<-eval_Cluster(kidata4@meta.data$seurat_clusters, trueclass)
```

```{r}
kidata4 <- FindNeighbors(kidata4, dims=1:15)
```

```{r}
kidata4 <- FindClusters(kidata4, resolution = 0.12) 
trueclass <- kidata@meta.data$free_annotation
```

### evaluating performance with SHC 
```{r}
library(FEAST)
ki_shc<-eval_Cluster(kidata4@meta.data$seurat_clusters, trueclass) 

library(knitr)
kable(ki_shc, caption = "Cluster Evaluation Metrics")
```

### UMAP (with true labels) 
```{r}
set.seed(1)
kidata4 <- RunUMAP(kidata4, dims=1:50) 
```

```{r}
DimPlot(kidata4, reduction="umap") 
```


```{r}
library(ggplot2)
# Step 1: Add true class and clustering labels to the metadata
kidata4$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata4, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata4$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata4$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  )
```


## RECOMBINE SHC-SSL algorithm 
```{r}
#normalize 
Y1 <- as.matrix(Y1)

library(dplyr)
tot_med <- colSums(Y1) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y1 <- apply(Y1, 2, norm1)
Y1 <- Y1 * tot_med
Y1 <- Y1 + 0.1
Y1 <- t(Y1) 

Y1 <- log2(Y1 - min(Y1) + 1)
tb <- tibble(cell = rownames(Y1)) %>%
  bind_cols(as_tibble(Y1)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 
dim(t)
View(t)

nperms = 3 
lambda1 <- 0.0001
lambda0s <- seq(0.001, 1000, length = 100)
library(recombine) 
# 
# # this is the most costly computation in RECOMBINE
#set.seed(1)   ##important!! 
library(recombine)
out3 <- SHC_SSL_gapstat(x = t,
                        nperms = nperms,
                        lambda0s = lambda0s,
                        lambda1 = lambda1)


w2 <- out3$result$w

names(w2) <- colnames(t)

w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl3 <- match(names, rownames(Y))
length(idxssl3)
#data <- counts(sim.groups3) 
library(Seurat)

data <- CreateSeuratObject(counts = Y,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
kidata2 <- RunPCA(sobj, features = rownames(data)[idxssl3]) 

kidata2 <- FindNeighbors(kidata2, dims=1:20)  
kidata2 <- FindClusters(kidata2, resolution=0.16)

kidata2$trueclass <- kidata1@meta.data$free_annotation

ki_recombine<-eval_Cluster(kidata2@meta.data$seurat_clusters, trueclass)
```

### evaluating SHC-SSL performance 

```{r}
kable(ki_recombine, caption = "Cluster Evaluation Metrics")  
```
### UMAP (with true labels) 

```{r}
kidata3 <- RunUMAP(kidata3, features=rownames(data)[idxssl3]) 
```

```{r}
DimPlot(kidata3, reduction="umap") 
```


```{r}
# Step 1: Add true class and clustering labels to the metadata
kidata3$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata3, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata3$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata3$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  ) 
```

## SCMarker algorithm 
```{r}
library(SCMarker)
dim(Y) 
Y1<-log(Y+1) 

res=ModalFilter(data=Y1,geneK=10,cellK=10,width=2) 
res=GeneFilter(obj=res)
res=getMarker(obj=res,k=300,n=30) 
head(res$marker) 
length(res$marker) #938 
```

### evaluating performance of SCMarker  
```{r}
kidata3 <- RunPCA(kidata2, features = res$marker) 

kidata3 <- FindNeighbors(kidata3, dims=1:20)  
kidata3 <- FindClusters(kidata3, resolution = 0.12)
kidata3$trueclass <- kidata1@meta.data$free_annotation
ki_scmarker<-eval_Cluster(kidata3@meta.data$seurat_clusters, kidata3$trueclass)
```

```{r}
#kidata2$trueclass <- kidata1@meta.data$free_annotation
#ki_recombine<-eval_Cluster(kidata2@meta.data$seurat_clusters, trueclass) 
kable(ki_scmarker, caption = "Cluster Evaluation Metrics") 
```

### UMAP (with true labels)
```{r}
#kidata2 <- RunUMAP(kidata2, features=genes) 
DimPlot(kidata2, reduction="umap") 
```

```{r}
# Step 1: Add true class and clustering labels to the metadata
kidata2$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata2, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata2$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata2$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  ) 
```


## Compare the four methods in clustering accuracy 
```{r}
res_compare<-cbind(ki_seurat, ki_recombine, ki_scmarker, ki_shc) 
kable(res_compare, caption = "Cluster Evaluation Metrics") 
```

It is shown that the SHC-SSL feature selection algorithm outperforms both Seurat and traditional SHC, and it is comparable to the SCMarker algorithm. 




